Preprocessing QC statistics

Noam, July 2023

In [81]:
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [138]:
import os
MOMAPS_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(MOMAPS_HOME, 'src', 'preprocessing', 'logs','deltaNLS')
PLOT_PATH = os.path.join(MOMAPS_HOME, 'src', 'preprocessing', 'notebooks','figures','delta_NLS')
os.chdir(MOMAPS_HOME)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1.5)
sns.color_palette("husl", 8)
plt.rcParams["image.cmap"] = "Set1"
from tqdm.notebook import tqdm
from src.common.lib.preprocessing_utils import rescale_intensity
from src.common.lib.images_qc import *
sys.path.insert(1, "/home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps/src/common/lib")
import contextlib
import io
import matplotlib
import warnings
warnings.filterwarnings('ignore', category=pd.core.common.SettingWithCopyWarning)
from src.common.lib.qc_config_tmp import *
In [109]:
df = log_files_qc(LOGS_PATH)
Total of 5 files were read.
Before dup handeling  (32524, 20)
After duplication removal #1: (32405, 21)
After duplication removal #2: (32256, 21)

validate folder structure and files existence

In [96]:
# choose batches
batches = [f'batch{i}' for i in range (2,6)]
#batches=['batch5']

Raw Files

In [97]:
root_directory_raw = os.path.join(MOMAPS_HOME, 'input', 'images', 'raw', 'SpinningDisk','deltaNLS_sort')

batches_raw = [batch.replace("_16bit","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, dnls_panels, dnls_markers,PLOT_PATH,
                                     dnls_marker_info,dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp,
                                     dnls_expected_dapi_raw, batches=batches_raw,fig_width=4, fig_height=10)
batch2
Folder structure is valid.
All files exists.
========
batch3
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/WT/panelN
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/TDP43/panelN
All files exists.
========
batch4
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/WT/panelN
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/TDP43/panelN
All files exists.
========
batch5
Folder structure is valid.
All files exists.
========
====================

Processed

In [98]:
root_directory_proc = os.path.join(MOMAPS_HOME, 'input', 'images', 'processed', 'spd2',
                              'SpinningDisk','deltaNLS')
procs = run_validate_folder_structure(root_directory_proc, True, dnls_panels, dnls_markers,PLOT_PATH,
                                     dnls_marker_info,dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp,
                                     dnls_expected_dapi_raw, batches=batches_raw, fig_width=4, fig_height=10)
batch2
Folder structure is valid.
All files exists.
========
batch3
Folder structure is valid.
All files exists.
========
batch4
Folder structure is valid.
All files exists.
========
batch5
Folder structure is valid.
All files exists.
========
====================

Difference between Raw and Processed

In [100]:
display_diff(batches, raws, procs, PLOT_PATH, 10,4)
batch2
========
batch3
========
batch4
========
batch5
========
In [101]:
for batch in batches:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, batch, sample_size_per_markers=200, num_markers=26)
    print(f'{batch} var: ',var)
    
batch2 var:  0.007057278003015998
batch3 var:  0.006750294806499788
batch4 var:  0.006772565814863787
batch5 var:  0.006584438495472644

Number of sites in each batch and cell line

In [119]:
plot_sites_count(df, dnls_expected_raw, dnls_lines_order, dnls_custom_palette, split_to_reps=True)

Number of Cells in Site for each batch and cell line

In [120]:
df_no_empty_sites = df[df.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, whole_cells=True)

plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, whole_cells=False)

# can add norm=True to norm by max

number of valid tiles per image (site)

In [131]:
plot_n_valid_tiles_count(df, custom_palette, reps, batch_min=2, batch_max=5)

Heatmap QC per batch, panel and cell line(tiles that passed QC condition)

In [142]:
plot_hm(df, split_by='rep', rows='cell_line', columns='panel', fig_height=2)

Assessing Staining Reproducibility and Outliers

In [190]:
dnls_cell_lines_for_disp
Out[190]:
{'WT_Untreated': 'WT_Untreated',
 'TDP43_dox': 'TDP43_dox',
 'TDP43_Untreated': 'TDP43_Untreated'}
In [182]:
 
Out[182]:
14
In [208]:
# for batch in [batches[0]]:
#     print(batch)
#     raw_df = run_calc_hist_new(f'deltaNLS_sort/{batch}',dnls_cell_lines_for_disp,
#                       dnls_markers,num_markers=((len(dnls_markers)-1)+dnls_panels.shape[1])*2,
#                              show_cond=True,sample_size_per_markers=2)
#     print("="*30)
    
batch2


[sample_images_all_markers_all_lines]: input_dir_batch:/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch2, _sample_size_per_markers:2, _num_markers:88
sampled_images: 352 sampled_markers: 176
sampled_images: 180 sampled_markers: 90


[sample_images_all_markers_all_lines]: input_dir_batch:/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS/batch2, _sample_size_per_markers:4, _num_markers:31
sampled_images: 248 sampled_markers: 62
sampled_images: 124 sampled_markers: 31


Total of 532 images were sampled for hist calculation.


Total of 372 images were sampled for hist calculation.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_119172/166820330.py in <module>
      3     raw_df = run_calc_hist_new(f'deltaNLS_sort/{batch}',dnls_cell_lines_for_disp,
      4                       dnls_markers,num_markers=((len(dnls_markers)-1)+dnls_panels.shape[1])*2,
----> 5                              show_cond=True,sample_size_per_markers=2)
      6     print("="*30)
      7 

/home/labs/hornsteinlab/Collaboration/MOmaps/src/common/lib/images_qc.py in run_calc_hist_new(batch, cell_lines_for_disp, markers, show_cond, sample_size_per_markers, num_markers)
    886     batch_df_processed = multiproc_calc_hists_per_batch_proc(images_proc, batch_df_processed, show_cond)
    887     #return batch_df_raw, batch_df_norm, batch_df_processed
--> 888     plot_hists(batch_df_raw, batch_df_norm, batch_df_processed, batch)
    889 
    890 

/home/labs/hornsteinlab/Collaboration/MOmaps/src/common/lib/images_qc.py in plot_hists(batch_df_raw, batch_df_norm, batch_df_proc, batch_num, plot_sep_by_cell_line)
    854 def plot_hists(batch_df_raw,batch_df_norm, batch_df_proc, batch_num, plot_sep_by_cell_line=False ):
    855     mean_hist_raw = batch_df_raw.copy()
--> 856     mean_hist_raw[batch_df_raw.columns.difference(['site_count'])] = batch_df_raw.drop(columns=['site_count']).div(batch_df_raw['site_count'], axis=0).astype(int)
    857 
    858     mean_hist_rescale = batch_df_norm.copy()

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5813         else:
   5814             # else, only a single dtype is given
-> 5815             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   5816             return self._constructor(new_data).__finalize__(self, method="astype")
   5817 

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    416 
    417     def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    419 
    420     def convert(

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    325                     applied = b.apply(f, **kwargs)
    326                 else:
--> 327                     applied = getattr(b, f)(**kwargs)
    328             except (TypeError, NotImplementedError):
    329                 if not ignore_failures:

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    589         values = self.values
    590 
--> 591         new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    592 
    593         new_values = maybe_coerce_values(new_values)

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
   1307 
   1308     try:
-> 1309         new_values = astype_array(values, dtype, copy=copy)
   1310     except (ValueError, TypeError):
   1311         # e.g. astype_nansafe can fail on object-dtype of strings

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
   1255 
   1256     else:
-> 1257         values = astype_nansafe(values, dtype, copy=copy)
   1258 
   1259     # in pandas we don't store numpy str dtypes, so convert to object

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1093     if arr.ndim > 1:
   1094         flat = arr.ravel()
-> 1095         result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna)
   1096         # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
   1097         # attribute "reshape"

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1172         # work around NumPy brokenness, #1987
   1173         if np.issubdtype(dtype.type, np.integer):
-> 1174             return lib.astype_intsafe(arr, dtype)
   1175 
   1176         # if we have a datetime/timedelta array of objects

/home/labs/hornsteinlab/Collaboration/MOmaps/anaconda3/momaps_37/lib/python3.7/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

ValueError: cannot convert float NaN to integer
In [ ]:
print(os.system('pwd'))
print("Done!")
In [ ]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system('jupyter nbconvert --to html src/preprocessing/notebooks/cell_count_stats_analysis_dnls.ipynb')
In [ ]: